# April 2025
# A short script to count how frequent occupational changes are for individuals
# To help answer a question from the JOLE review of the second version

library(data.table)
library(magrittr)
library(arrow)

source("AKMland_functions.R")

# Paths of the three per-period estimates files (.parquet), all located
# under the output directory `saverep`
period1 <- paste0(saverep, "20022007_two_methods.parquet")
period2 <- paste0(saverep, "20082013_two_methods.parquet")
period3 <- paste0(saverep, "20142019_two_methods.parquet")

# Number of occ transitions per period
# For each period file we count, per individual, the number of distinct
# occupations minus one, then report (a) how many individuals have at least
# one transition and (b) the total number of transitions.
# Rows of `results` follow the order period1, period2, period3.
period_files <- c(period1, period2, period3)
period_rows <- vector("list", length(period_files))
for (i in seq_along(period_files)) {
  tic("get occ transitions for one period")
  d <- data.table(read_parquet(period_files[i]))
  # `=` (not `:=`) inside .(): we want one aggregated row per individual,
  # and `:=` is not allowed inside .().
  # Note: uniqueN() keeps NA by default, so a missing occupation counts as
  # a distinct value here.
  docc <- d[, .(occ_transition = uniqueN(occ) - 1), by = indiv_id]
  period_rows[[i]] <- list(
    n_indiv_with_occ_transitions = nrow(docc[occ_transition > 0, ]),
    n_occ_transitions = sum(docc$occ_transition)
  )
  toc()
}
# rbindlist() gives a regular data.table with atomic columns (one row per
# period) instead of a list-matrix grown by rbind() inside the loop.
results <- rbindlist(period_rows)

savn <- paste0(saverep, "occup_changes_three_periods.csv")
write.csv(results, file = savn)

# Number of occ transitions (and firm movers) per two-year panel
# To loop over all two-year panels we need a full 2002-2019 dataset, so the
# three period files are stacked into a single data.table `p`.
p <- rbindlist(
  lapply(
    c(period1, period2, period3),
    function(parquet_file) {
      # Only the four columns used below are read from disk (col_select).
      # read_parquet() returns a data.frame/tibble, so it is converted
      # explicitly before any data.table syntax is applied to it.
      as.data.table(
        read_parquet(
          parquet_file,
          col_select = c("indiv_id", "year", "occ", "firm_id")
        )
      )
    }
  ),
  use.names = TRUE
)

# release the memory held by the per-period temporaries
gc()

# Load the PCS nomenclature (semicolon-separated file read with read.csv2)
# and give its columns short working names
pcs_raw <- read.csv2(paste0(nomenclatures, "Nomenclature_N2_PCS2020.csv"))
pcs <- setNames(pcs_raw, c("code", "label", "label_short"))
rm(pcs_raw)


# A function computing occupation movers and firm movers a bit faster
#
# Arguments:
#   d  data.table for one two-year panel, with at least the columns
#      indiv_id, year, occ and firm_id.
#   y  first year of the panel; it is only reported back as `yearmin`.
#
# Returns a named list of counts:
#   yearmin                   the value of y
#   n_obs                     rows in d
#   one_year_indiv            rows belonging to individuals seen in one year only
#   useful_obs                rows belonging to individuals seen in both years
#   two_years_indiv           individuals seen in both years
#   movers                    of those, individuals whose firm_id differs
#   two_years_indiv_with_occ  individuals whose two occupations are both
#                             valid PCS codes
#   occ_transitions           of those, individuals whose occ differs
#   movers_with_occ           of those, individuals whose firm_id differs
#
# Relies on the global table `pcs` (column `code`) for the list of valid
# occupation codes, and on tic()/toc() for timing messages.
faster_count_occ <- function(d, y){
  tic("Counting years, occupations and firms per individual")
  yearmin <- y
  nobs <- nrow(d)
  # NOTE: `:=` adds the nyear column to the caller's table by reference;
  # callers should pass a subset/copy they do not need to keep untouched.
  d <- d[, nyear := uniqueN(year), by = indiv_id]
  noneobs <- nrow(d[nyear == 1, ])

  # discarding individuals observed in only one of the two years
  d <- d[nyear == 2, ]
  d <- d[, .(indiv_id, year, occ, firm_id)]
  nusefulobs <- nrow(d)

  # putting d in wide format: one row per individual, one (occ, firm_id)
  # pair per year
  dt <- reshape(d, timevar = "year",
                   idvar = "indiv_id",
                   direction = "wide")
  # The suffixes .1/.2 only label the two years; every comparison below is
  # symmetric, so which year ends up first does not affect the counts.
  names(dt) <- c("indiv_id", "occ.1", "firm_id.1", "occ.2", "firm_id.2")
  toc()

  tic("summarizing number of firms and occupations per individuals")
  nindiv <- nrow(dt)
  nmovers <- nrow(dt[firm_id.1 != firm_id.2, ])
  # discarding individuals whose occupation is not a known PCS code in at
  # least one of the two years (this includes missing values)
  dt <- dt[(occ.1 %in% pcs$code) & (occ.2 %in% pcs$code), ]
  nindiv_occ <- nrow(dt)
  nocc_transitions <- nrow(dt[occ.1 != occ.2, ])
  nmovers_occ <- nrow(dt[firm_id.1 != firm_id.2, ])

  # Use yearmin here, not `year`: outside a data.table call `year` is not
  # the column but data.table's year() function.
  counts <- list(
    yearmin = yearmin,
    n_obs = nobs,
    one_year_indiv = noneobs,
    useful_obs = nusefulobs,
    two_years_indiv = nindiv,
    movers = nmovers,
    two_years_indiv_with_occ = nindiv_occ,
    occ_transitions = nocc_transitions,
    movers_with_occ = nmovers_occ
  )

  toc()
  return(counts)
}

# Now we use this function

# We compare firm mobility and occupational transitions between
# year one and year two over all two-year panels.
# Because each panel only spans two years, an individual with occupations
# A - B - A over three years counts for two occupational transitions
# (and not for two unique occupations and just one transition).

panel_years <- 2002:2018
yearly_rows <- vector("list", length(panel_years))
for (i in seq_along(panel_years)) {
  y <- panel_years[i]
  print(y)
  # observations of years y and y+1 (a fresh subset of the full panel)
  d <- p[(year == y) | (year == y + 1), ]
  occ_c <- faster_count_occ(d, y = y)
  # one CSV per panel, so partial results survive an interrupted run
  savn <- paste0(saverep, "occup_changes_2years_periods_", y, ".csv")
  write.csv(occ_c, file = savn)
  yearly_rows[[i]] <- occ_c
}
# rbindlist() stacks the per-panel named lists into a regular data.table
# (one row per panel) instead of a list-matrix grown by rbind() in the loop.
results <- rbindlist(yearly_rows)

savn <- paste0(saverep, "occup_changes_all_2years.csv")
write.csv(results, file = savn)
